Write a short description about the course and add a link to your GitHub repository here. This is an R Markdown (.Rmd) file so you should use R Markdown syntax.
Test text.
# This is a so-called "R chunk" where you can write R code.
# Print the current date and time (evaluated when the document is knitted).
date()
## [1] "Fri Nov 13 14:39:18 2020"
The text continues here.
Describe the work you have done this week and summarize your learning.
# Read the local learning2014 data set (first CSV column holds row names).
# NOTE(review): the absolute Windows path is machine-specific; prefer a
# relative path such as "data/learning2014.csv" so the document knits
# on other machines.
lrn2014 <- read.csv("C:/Users/Juhana/Documents/IODS-project-1/data/learning2014.csv", row.names = 1)
# Read alc.csv directly from GitHub. The original URL pointed at the
# repository's HTML "tree" page, which is not CSV content; the raw file
# must be fetched from raw.githubusercontent.com.
alcGit <- read.csv("https://raw.githubusercontent.com/JunzQ7/IODS-project/master/data/alc.csv")
# Plotting libraries for the pairwise overview of learning2014.
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Scatter-plot matrix of all variables; histograms in the combo
# panels use 20 bins.
p <- ggpairs(
  lrn2014,
  mapping = aes(),
  lower = list(combo = wrap("facethist", bins = 20))
)
p
# Multiple regression: exam points explained by attitude and the
# strategic (stra) and surface (surf) learning scores.
# Use `<-` for assignment (tidyverse style), not `=`.
model <- lm(Points ~ Attitude + stra + surf, data = lrn2014)
summary(model)
##
## Call:
## lm(formula = Points ~ Attitude + stra + surf, data = lrn2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1550 -3.4346 0.5156 3.6401 10.8952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.01711 3.68375 2.991 0.00322 **
## Attitude 0.33952 0.05741 5.913 1.93e-08 ***
## stra 0.85313 0.54159 1.575 0.11716
## surf -0.58607 0.80138 -0.731 0.46563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.296 on 162 degrees of freedom
## Multiple R-squared: 0.2074, Adjusted R-squared: 0.1927
## F-statistic: 14.13 on 3 and 162 DF, p-value: 3.156e-08
# Simpler model: stra and surf were not significant above, so keep
# Attitude as the only explanatory variable.
# Use `<-` for assignment (tidyverse style), not `=`.
model2 <- lm(Points ~ Attitude, data = lrn2014)
summary(model2)
##
## Call:
## lm(formula = Points ~ Attitude, data = lrn2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9763 -3.2119 0.4339 4.1534 10.6645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.63715 1.83035 6.358 1.95e-09 ***
## Attitude 0.35255 0.05674 6.214 4.12e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.32 on 164 degrees of freedom
## Multiple R-squared: 0.1906, Adjusted R-squared: 0.1856
## F-statistic: 38.61 on 1 and 164 DF, p-value: 4.119e-09
plot(model2, which = c(1,2,5))
alc <- read.csv("data/alc.csv", row.names = 1)
Selected explanatory variables for modelling high alcohol use: sex, absences, internet, activities.
library(ggplot2)
# Box plot of school absences for the two high_use groups.
ggplot(alc, aes(x = high_use, y = absences)) +
  geom_boxplot()
(prop.table(table(alc$sex, alc$high_use), margin = 1))
##
## FALSE TRUE
## F 0.7878788 0.2121212
## M 0.6086957 0.3913043
# Logistic regression: high alcohol use explained by absences, sex,
# internet access and extracurricular activities.
m <- glm(high_use ~ absences + sex + internet + activities, data = alc, family = "binomial")
summary(m)
##
## Call:
## glm(formula = high_use ~ absences + sex + internet + activities,
## family = "binomial", data = alc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3480 -0.8517 -0.6163 1.1187 2.0931
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.68510 0.35648 -4.727 2.28e-06 ***
## absences 0.09672 0.02315 4.178 2.95e-05 ***
## sexM 1.02724 0.24411 4.208 2.57e-05 ***
## internetyes 0.02369 0.33998 0.070 0.944
## activitiesyes -0.38679 0.23891 -1.619 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 427.43 on 377 degrees of freedom
## AIC: 437.43
##
## Number of Fisher Scoring iterations: 4
coef(m)
## (Intercept) absences sexM internetyes activitiesyes
## -1.68509737 0.09672074 1.02724315 0.02369222 -0.38678774
# Predicted probabilities of high alcohol use for every student.
probabilities <- predict(m, type = "response")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Attach the probabilities and a 0.5-threshold class prediction to 'alc'.
alc <- alc %>%
  mutate(probability = probabilities) %>%
  mutate(prediction = probability > 0.5)
# Confusion matrix: observed high_use versus the predictions.
table(high_use = alc$high_use, prediction = alc$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 257 11
## TRUE 88 26
# access dplyr and ggplot2
library(dplyr)
library(ggplot2)
# 'high_use' versus predicted 'probability', colored by the prediction
ggplot(alc, aes(x = probability, y = high_use, col = prediction)) +
  geom_point()
# confusion matrix as proportions, with row/column sums added
table(high_use = alc$high_use, prediction = alc$prediction) %>%
  prop.table() %>%
  addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.67277487 0.02879581 0.70157068
## TRUE 0.23036649 0.06806283 0.29842932
## Sum 0.90314136 0.09685864 1.00000000
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Load the Boston housing data from MASS and inspect its structure:
# 506 observations of 14 numeric/integer variables.
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Standardize every Boston variable (subtract the mean, divide by the sd).
# Use `<-` for assignment (tidyverse style), not `=`.
boston_scaled <- scale(Boston)
str(boston_scaled)
## num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:506] "1" "2" "3" "4" ...
## ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
# scale() returned a matrix; convert back to a data frame.
boston_scaled <- as.data.frame(boston_scaled)
# Quartiles of the standardized crime rate serve as break points.
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
# Cut the crime rate into four classes along its quartiles.
crime <- cut(
  boston_scaled$crim,
  breaks = bins,
  include.lowest = TRUE,
  labels = c("low", "med_low", "med_high", "high")
)
# Replace the numeric crim column with the categorical version.
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
# Total number of observations.
n <- nrow(boston_scaled)
# choose randomly 80% of the rows
# NOTE(review): no set.seed() before sample(), so the split (and every
# result below) changes on each knit — consider seeding for reproducibility.
ind <- sample(n, size = n * 0.8)
# create train set
train <- boston_scaled[ind,]
# create test set
test <- boston_scaled[-ind,]
# save the correct classes from test data
correct_classes <- test$crime
# remove the crime variable from test data
test <- dplyr::select(test, -crime)
# LDA
# Linear discriminant analysis: crime class as target, all remaining
# (scaled) variables as predictors.
lda.fit <- lda(crime ~ ., data = train)
# print the lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2648515 0.2500000 0.2524752 0.2326733
##
## Group means:
## zn indus chas nox rm age
## low 0.97158426 -0.9599453 -0.12514775 -0.9020608 0.43687291 -0.8884058
## med_low -0.06653482 -0.3089421 -0.03844192 -0.5644652 -0.08142222 -0.3802503
## med_high -0.36785679 0.1959520 0.22945822 0.4353902 0.13514308 0.4274404
## high -0.48724019 1.0172896 -0.14667693 1.0170909 -0.37162929 0.8103798
## dis rad tax ptratio black lstat
## low 0.8957949 -0.6920711 -0.7123074 -0.48237641 0.37701997 -0.78089619
## med_low 0.3453090 -0.5418150 -0.4504077 -0.07595779 0.32825882 -0.18599640
## med_high -0.3856212 -0.4020080 -0.3005790 -0.36121195 0.07567687 0.06023923
## high -0.8461426 1.6363892 1.5128120 0.77875205 -0.80524897 0.79632665
## medv
## low 0.49069664
## med_low 0.03314276
## med_high 0.17956820
## high -0.63315929
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.10370719 0.64781155 -0.93131878
## indus 0.07193844 -0.52660054 0.34914942
## chas -0.08806524 -0.09837703 0.04445874
## nox 0.41691658 -0.68410797 -1.27575067
## rm -0.08873134 -0.13626175 -0.16449397
## age 0.23746166 -0.23530082 -0.25111561
## dis -0.03598091 -0.30312595 0.02921287
## rad 3.19201223 0.74005044 -0.13677099
## tax -0.04637429 0.33491159 0.54341675
## ptratio 0.13937557 0.02441220 -0.21080271
## black -0.16510102 0.02525885 0.15853548
## lstat 0.26555334 -0.37318313 0.32018141
## medv 0.23700719 -0.48666180 -0.13153044
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9433 0.0452 0.0115
# Draw labelled arrows for the LDA coefficient vectors on an existing
# biplot. 'x' is a fitted lda object; 'choices' selects the two
# discriminants to project onto; 'myscale' stretches the arrows.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
  coefs <- coef(x)
  x_tip <- myscale * coefs[, choices[1]]
  y_tip <- myscale * coefs[, choices[2]]
  # one arrow per predictor, each starting at the origin
  arrows(x0 = 0, y0 = 0, x1 = x_tip, y1 = y_tip,
         col = color, length = arrow_heads)
  # label every arrow tip with its predictor name
  text(myscale * coefs[, choices], labels = row.names(coefs),
       cex = tex, col = color, pos = 3)
}
# target classes as numeric
classes <- as.numeric(train$crime)
# plot the lda results
# Training data in the first two discriminants, colored and shaped by class.
plot(lda.fit, dimen = 2, col = classes, pch = classes)
# Overlay the coefficient arrows defined earlier.
lda.arrows(lda.fit, myscale = 1)
# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results
table(correct = correct_classes, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 9 8 3 0
## med_low 1 21 3 0
## med_high 0 12 11 1
## high 0 0 0 33
# load MASS and Boston
library(MASS)
library(ggplot2)
data('Boston')
# Standardize so the distance measures treat all columns comparably.
Boston <- scale(Boston)
# euclidean distance matrix
dist_eu <- dist(Boston)
# look at the summary of the distances
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
# manhattan distance matrix
dist_man <- dist(Boston, method = 'manhattan')
# look at the summary of the distances
summary(dist_man)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2662 8.4832 12.6090 13.5488 17.7568 48.8618
# Seed the RNG before the first kmeans() call as well, so the 3-cluster
# solution is reproducible (the original seeded only the WCSS search below).
set.seed(123)
# k-means clustering
km <- kmeans(Boston, centers = 3)
# plot the Boston dataset with clusters
pairs(Boston, col = km$cluster)
set.seed(123)
# determine the number of clusters
k_max <- 10
# calculate the total within sum of squares for k = 1..k_max
# (vapply guarantees a numeric result, unlike sapply)
twcss <- vapply(seq_len(k_max),
                function(k) kmeans(Boston, k)$tot.withinss,
                numeric(1))
# visualize the results
qplot(x = seq_len(k_max), y = twcss, geom = 'line')
# Reload the unscaled data and standardize it again for clustering.
data(Boston)
Boston <- as.data.frame(scale(Boston))
# Seed the RNG so the 5-cluster k-means solution is reproducible
# (the original call was unseeded).
set.seed(123)
km <- kmeans(Boston, centers = 5)
# Use the k-means cluster assignment as the LDA target variable.
Boston$km <- km$cluster
lda.fit <- lda(km ~ ., Boston)
classes <- as.numeric(Boston$km)
# Plot the clusters in discriminant space and overlay coefficient arrows.
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)
library(MASS)
data(Boston)
# Rebuild the scaled data and crime classes (same steps as earlier).
# NOTE(review): sample() below is unseeded, so this split differs from
# the earlier one and changes on every knit.
boston_scaled = scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(boston_scaled)
ind <- sample(n, size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
correct_classes <- test$crime
test <- dplyr::select(test, -crime)
# LDA
lda.fit <- lda(crime ~ ., data = train)
# Predictor matrix: the training data without the target column.
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication
# Project the training observations onto the discriminant axes.
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = as.numeric(train$crime))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.